/*
********************************************************************************
Description:		Select a random sample from SA302 dataset (control group) 
					and merge the VVIV data onto the control and treatment
					individuals

Uses:				"$CQIindata\Randoms List\randomslist_txptype_anon.csv
					"${CQIindata}\CQI_enquiry_incometax.dta"
					"${VVoutdata}\ValidView_1997_2012.dta"
					"${SA302outdata}\SA302_1997_2012.dta"

Saves:				"$mergedata\CouldhaveSample.dta"
		
********************************************************************************
*/



capture program drop constructCouldHaveDatset
program define constructCouldHaveDatset

	syntax, filename(string) [seed(integer 567679667)]

	set more off

	*************************************
	* OPEN LIST OF RANDOM ENQUIRY CASES *
	*************************************

	tempfile randoms

	*import list of random enquiries (deals with the missings from the main CQI file)
	import delimited using "$CQIindata\Randoms List\randomslist_txptype_anon.csv", clear
	rename utr_anon utr_no
	sort utr_no tax_year
	isid utr_no tax_year
	save "`randoms'"



	*************************************************************
	* MERGE IN LIST OF ALL CASES (TO IDENTIFY NON-RANDOM CASES) *
	*************************************************************

	* We only need minimal variables from SA302 because we only need to be able to identify the merges


	* Use SA302 to identify individuals not randomly audited
	********************************************************

	use utr_no tax_year using "${SA302outdata}\SA302_1997_2012.dta", clear


	* Merge in randoms
	******************

	sort utr_no tax_year
	merge 1:1 utr_no tax_year using "`randoms'"



	**************************
	* DROP SOME OBSERVATIONS *
	**************************

	* Drop partnerships and trusts
	******************************

	qui drop if inlist(txp_type, "P", "T")
	drop txp_type

	qui drop if (_merge == 2)

	* Make sure we have the relevant years
	* Note: the end year is 2010 because we are currently selecting a control group and we don't want control groups beyond 2010 (since no audit data past here)
	keep if inrange(tax_year,1997,2010)





	*****************************************
	* RANDOMLY SELECT THE COULD-HAVE SAMPLE *
	*****************************************

	* Create identifier for treated individuals in the current period
	gen byte randauditnow = inlist(_merge,3)
	label var randauditnow "Indicator for selected for audit in the current period"
	drop _merge

	* Randomly select `rawnumtimes'*(number of treated obs) in the year of interest, that were not randomly audited (will cut down later)
	local rawnumtimes = 8 //will drop some people later
	set seed 59454678
	gen double rand = runiform()

	* Identify individuals who were in previous year(s)
	sort utr_no tax_year
	by utr_no (tax_year): gen byte inprevyr = (tax_year == tax_year[_n-1] + 1)
	by utr_no (tax_year): gen byte inprevyr2 = (tax_year == tax_year[_n-2] + 2)

	preserve
	tempfile skeleton

	local firsttime = 1
	local firstyear = 1997 
	local numyears = 2010 - `firstyear' + 1
	forval year=`firstyear'/2010 {

		di as text "Current year `year' (time: `c(current_time)')"
		
		* Restore on second and subsequent loops
		if !`firsttime' restore, preserve

		* Keep only the current tax year
		qui keep if (tax_year == `year')
		
		*CONSTRUCT A UNCONDITIONALLY RANDOM CONTROL GROUP
		*****************************************************		

		* Treatment observations in relevant year
		qui gen byte treatment = 1 if randauditnow == 1
		label var treatment "Indicator for treatment in some period during sample"
		su treatment, meanonly
		local treatSampleSize = r(mean)*r(N)
		
		* Control observations in relevant year (choose `rawnumtimes'*(number of treated obs) of them)
		qui replace rand = 10 if !(randauditnow == 0)
		sort rand
		qui replace treatment = 0 if (_n <= `rawnumtimes'*`treatSampleSize') & (randauditnow == 0)
		qui count if (treatment == 0)
		assert (r(N) == `rawnumtimes'*`treatSampleSize')
		qui keep if (treatment < .)
		drop rand

		* Now create all years for these selected observations
		* (All years means 1997 to 2012, which are the years we want to use SA302 for.
		* This is different to the years we want to use CQI for, which is 1999 to 2010
		* - after which stratification was introduced)
		qui expand 2012 - 1997 + 1
		qui bysort utr_no: replace tax_year = 1997 + _n - 1
		qui replace randauditnow = 0 if (`year' != tax_year)
		
		gen int audityear = `year'
		label var audityear "The year the observation is in the sample for (both treatment and control)"

		gen byte yrssince = tax_year - audityear
		label var yrssince "Years since audityear (both treatment and control)"

		if !`firsttime' append using "`skeleton'"
		qui save "`skeleton'", replace

		local firsttime = 0

	}

	di as text "Outside year loop"	
	restore, not
	sort utr_no tax_year yrssince
	isid utr_no tax_year yrssince

	save "`skeleton'", replace



	*************************
	* MERGE IN THE CQI DATA *
	*************************

	use utr_no tax_year taxpayer_type_ind rejection_code orig_system_ind status_ind random_sig ///
		 enqry_start_date last_interaction settlement_date enqry_closure_date direct_dutyenq_amt  ///
		 stlmnt_class_ind error_reason_code ///
		 using "${CQIoutdata}\CQI_enquiry_incometax.dta", clear

	* Do some consistency checks
	assert (utr_no < .)
	assert (tax_year < .)
	assert inlist(taxpayer_type_ind, "H", "P", "T")
	assert (status_ind != "")
	assert inlist(random_sig, "Y", "N")
	assert (orig_system_ind == "IT")
	drop orig_system_ind

	* Enquiry start date (enqry_start_date) and settlement date (settlement_date) are non-missing if the enquiry is settled but are otherwise usually missing
	assert (enqry_start_date < .) if (status_ind == "S")
	assert (settlement_date < .) if (status_ind == "S")

	* Settlement date is always no earlier than enquiry closure date (if the latter is non-missing)
	assert (settlement_date >= enqry_closure_date) if (enqry_closure_date < .)
	drop enqry_closure_date


	* Keep relevant taxpayers
	*************************

	sort utr_no tax_year
	qui merge 1:1 utr_no tax_year using "`randoms'"
	drop if (_merge == 2)
	qui gen str1 random_sig = "Y" if (_merge == 3)
	qui replace random_sig = "N" if (_merge == 1)
	drop _merge

	* Keep only individual taxpayers
	qui keep if inlist(taxpayer_type_ind, "H")
	drop taxpayer_type_ind

	* Check there are no duplicates
	sort utr_no tax_year
	isid utr_no tax_year

	* Relevant years
	keep if inrange(tax_year,1997,2010)

	* Save CQI data
	tempfile cqitemp
	save `cqitemp'

	* now merge the cqi data into earlier skeleton
	use "`skeleton'", clear

	qui merge m:1 utr_no tax_year using `cqitemp', keep(match master) generate(_merge_in_CQI)
	sort utr_no tax_year yrssince


	***************************
	* MERGE IN THE SA302 DATA *
	***************************

	* _merge == 1: in the skeleton but not in SA302
	* _merge == 2: in SA302 but not in the skeleton. This is individuals not selected for the sample
	* _merge == 3: in the skeleton and in SA302

	* The skeleton includes only a subset of individuals, hence why there will be _merge == 2 observations
	* There may be _merge == 1 observations (in the skeleton but not in the SA302) because of the way the skeleton was created, this is fine
	* We want to keep only those observations that are in the skeleton, hence keep(match master)

	qui merge m:1 utr_no tax_year using "${SA302outdata}\SA302_1997_2012.dta", keep(match master) generate(_merge_in_SA302)
	sort utr_no tax_year yrssince

	* Rename some variables so that the names are shorter
	rename emp_directorship_inc			 empinc
	rename sharesch						 sharesinc
	rename sempllinc					 sempinc
	rename partnership_inc				 pshipinc
	rename foreign_inc					 foreigninc
	rename trust_estates				 trustestatinc
	rename divs							 divinc
	rename intinc_divinc				 intdivinc
	rename pens							 pensinc
	rename inc_tot_amt					 totinc
	rename inc_taxable_amt				 taxableinc
	rename inc_cg_taxable_amt			 taxableinccg
	rename it_plus_cl4_amt				 it_cl4
	rename it_cl4_cgt_amt				 it_cl4_cgt

	
	***************************
	* MERGE IN THE VV/IV DATA *
	***************************

	* The skeleton includes only a subset of individuals, hence why there will be _merge == 2 observations
	* There may be _merge == 1 observations (in the skeleton but not in the VV/IV) because of the way the skeleton was created, this is fine
	* We want to keep only those observations that are in the skeleton, hence keep(match master)

	qui merge m:1 utr_no tax_year using "${VVoutdata}\ValidView_1997_2012.dta", keep(match master) generate(_merge_in_VVIV)
	sort utr_no tax_year yrssince
	
	drop nino_anon 

	qui replace age = . if (age > 120 | age < 16)
	gen byte inLonSE = inlist(gorcode,7,8)
	order inLonSE, after(gorcode)
	
	qui gen int tcn = tcn_tsa
	qui replace tcn = tcn_tsb if (tcn == 0) 
	qui replace tcn = tcn_psa if (tcn == 0) 
	qui replace tcn = tcn_psb if (tcn == 0)
	label variable tcn "Trade classification number (consolidated)"
	order tcn, before(tcn_tsa)
	drop tcn_tsa tcn_tsb tcn_psa tcn_psb



	*******************************
	* MARK SURVIVING OBSERVATIONS *
	*******************************

	*observations in the rectangular data (skeleton) that are also in SA302 contemporaneously
	gen byte survives = (_merge_in_SA302 == 3)




	*******************************************
	* SET MISSING INCOME OBSERVATIONS TO ZERO *
	*******************************************

	* This is just moving sa302source out of the way so that all the income components are next to each other
	order sa302source, before(_merge_in_SA302)

	* Fill in income components and totals with zero
	foreach var of varlist empinc-it_cl4_cgt {
		qui replace `var' = 0 if !survives
	}

	
	**************************
	* TIDY UP FEMALE AND AGE *
	**************************
	
	* We do this (even though it's already been done in the VV_cleanData.do program) to copy information across to !survives cases
	
	* Female
	gegen meanfemale = mean(female), by(utr_no audityear)
	qui replace meanfemale = 0 if (meanfemale > 0 & meanfemale < 0.5)
	qui replace meanfemale = 1 if (meanfemale >= 0.5 & meanfemale < 1)
	qui replace female = meanfemale
	drop meanfemale
	
	* Age
	qui gen int agenew = age
	qui bysort utr_no audityear (tax_year): replace agenew = agenew[_n-1] + (tax_year - tax_year[_n-1]) if (agenew >= .) & (agenew[_n-1] < .) & (_n > 1)
	gen int tax_year_neg = -tax_year
	qui bysort utr_no audityear (tax_year_neg): replace agenew = agenew[_n-1] + (tax_year - tax_year[_n-1]) if (agenew >= .) & (agenew[_n-1] < .) & (_n > 1)
	qui replace age = agenew
	drop agenew tax_year_neg
	gsort utr_no audityear tax_year
	
	
	***********************************
	* SELECT SAMPLE PERIOD EXPLICITLY *
	***********************************

	drop if !inrange(audityear,1999,2009) /*HMRC recommend dropping before 1999 because random enquiries were just starting out before that and aren't consistent. In 2010 enquiry data start being recorded on a new system */
	drop if !inrange(tax_year,1999,2012) 



	*****************
	* SORT AND SAVE *
	*****************

	format %14.0g utr_no
	sort utr_no audityear tax_year
	qui compress
	save "$mergedata/`filename'.dta", replace


	***************************************
	* UPRATE DATA TO COMMON YEAR'S PRICES *
	***************************************

	* Put the financial variables in a common year's prices 
	* (We average across � figures from different years, so these need to be in a common year's prices to be interpretable)

	* Import and save CPI
	import delimited "$pricesoutdata\cpiseries.csv", clear
	sort tax_year
	save "$pricesoutdata\cpiseries.dta", replace

	* Create CPI based in 2012
	su cpi if (tax_year == 2012), meanonly
	gen double cpi2012 = cpi/r(mean)*100
	drop cpi
	tempfile cpifile
	save `cpifile'

	* Merge CPI into sample
	use "$mergedata/`filename'.dta", replace
	merge m:1 tax_year using `cpifile', keep(master match)
	assert (_merge == 3)
	drop _merge

	* Uprate monetary values to 2012 terms
	foreach var of varlist turnover direct_dutyenq_amt empinc-it_cl4_cgt it_aft_a {
		qui gen double `var'_raw = `var'
		order `var'_raw, before(`var')
		qui replace `var' = `var'/cpi2012*100
		rename `var' `var'_cpi
	}
	drop cpi2012

	* Sort and save again
	sort utr_no audityear tax_year
	qui compress
	save "$mergedata/`filename'.dta", replace



	****************************************************
	* KEEP 6x NUMBER OF TREATED OBSERVATIONS EACH YEAR *
	****************************************************

	* By dropping a few more observations we can end up with `numtimes' times the number of treatment individuals as control individuals, so consistent each year
	local numtimes = 6

	set seed `seed'
	gen rand = uniform()

	* Find out how many treated observations there are in year of audit
	egen long numtreated = total(yrssince == 0 & treatment == 1), by(audityear)

	* Mark control observations to drop if there are too many of them relative to the number of treated observations
	bysort audityear yrssince treatment (rand): gen byte todrop = (_n > `numtimes'*numtreated) if (yrssince == 0 & treatment == 0)
	egen byte maxtodrop = max(todrop), by(utr_no audityear)
	qui replace maxtodrop = 0 if (maxtodrop >= .)
	drop if maxtodrop
	drop todrop maxtodrop numtreated rand

	* Check what we've done has worked
	su audityear, meanonly
	forval yr = `=r(min)'/`=r(max)' {
		qui count if (audityear == `yr' & yrssince == 0 & treatment == 1)
		local numtreated = r(N)
		qui count if (audityear == `yr' & yrssince == 0 & treatment == 0)
		assert (r(N) == `numtimes'*`numtreated')
	}

	
	***************************
	* CREATE SOME USEFUL VARS *
	***************************

	* Create an always positive yrssince, that can be used as a factor
	gen byte yrssince20 = yrssince +20
	label var yrssince20 "yrssince + 20"

	* Count number of continuous years in self assessment up to and including audit year
	gegen byte minyrssince = min(yrssince), by(utr_no audityear)
	gegen byte maxyrssince = max(yrssince), by(utr_no audityear)
	gen   byte yrssinceneg = -yrssince
	qui gen byte yrsfiling = 1 if (yrssince == 0)
	qui bysort utr_no audityear (yrssinceneg): replace yrsfiling = yrsfiling[_n-1] + 1 if (yrssinceneg > 0) & (survives)
	gegen byte maxyrsfiling = max(yrsfiling), by(utr_no audityear)
	assert (maxyrsfiling < .)
	drop yrsfiling
	rename maxyrsfiling yrsfiling
	label variable yrsfiling "Number of continuous years of filing up to and including audit year"
	drop yrssinceneg
	
	di as text "var = survives: " _c
	foreach lags of numlist 1(1)5 {
		di as text "l`lags', " _c
		egen byte survives_l`lags' = total(survives*(yrssince == -`lags')), by(utr_no audityear)
	}

	* Identify individuals actually audited in yrssince=0
	egen actuallyaudited = max((yrssince == 0) & (status_ind == "S")), by(utr_no audityear)
	label variable actuallyaudited "Actually audited in yrssince=0 (rather than just selected for audit)"

	*construct tidied industry variables
	run "${dofiles}\convert TCN to broad industry.do"

	*create an id variable
	egen id = group(utr_no audityear)
	xtset id tax_year
	
		
	*****************
	* SORT AND SAVE *
	*****************

	format %14.0g utr_no
	sort utr_no audityear tax_year
	qui compress
	save "$mergedata/`filename'.dta", replace

	
end
